Scatterplots
# check the first 6 lines
head(dat)
## sex age edu occ oexp income
## 1 female 62 low low 35 953
## 2 male 32 high high 6 1224
## 3 male 56 med. high 36 1466
## 4 female 63 med. med. 38 1339
## 5 male 20 low low 3 1184
## 6 female 38 med. med. 12 1196
# summary
summary(dat)
## sex age edu occ oexp
## male :1069 Min. :18.00 low :811 low :595 Min. : 0.0
## female: 853 1st Qu.:31.00 med.:692 med.:700 1st Qu.: 8.0
## Median :42.00 high:419 high:627 Median :19.0
## Mean :41.69 Mean :19.5
## 3rd Qu.:52.00 3rd Qu.:30.0
## Max. :65.00 Max. :48.0
## income
## Min. : 704
## 1st Qu.:1117
## Median :1304
## Mean :1313
## 3rd Qu.:1506
## Max. :2115
- use default functions to draw a scatterplot.
# scatterplot
plot(income ~ oexp, data = dat, cex = .6, xlab = 'Occupational Experience', ylab = 'Monthly Net Income')

- use ggplot to draw a scatterplot, regression smoother, and linear regression line.
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw()

Conditional distribution & expectation
# conditional distribution
plot(income ~ factor(oexp), data = dat, cex = .4, xlab = 'Occupational Experience',
ylab = 'Monthly Net Income', main = 'Conditional Distribution')

# conditional expectation
plot(income ~ oexp, data=dat, cex = .4, xlab = 'Occupational Experience',
ylab = 'Monthly Net Income', main = 'Conditional Mean')
m.vec <- with(dat, tapply(income, oexp, mean)) # creates vector of conditional means
o.vec <- with(dat, sort(unique(oexp))) # determines the unique values of oexp and sorts them
points(o.vec, m.vec, col = 'red', pch = 16, type = 'o', lwd = 2) # plot path of means

# compute conditional quartiles
q50.vec <- with(dat, tapply(income, oexp, median))
q25.vec <- with(dat, tapply(income, oexp, quantile, prob = .25)) # first quartile
q75.vec <- with(dat, tapply(income, oexp, quantile, prob = .75)) # third quartile
plot(income ~ oexp, data=dat, cex = .4, xlab = 'Occup. Experience',
ylab = 'Monthly Net Income', main = 'Conditional Means/Distribution')
points(o.vec, m.vec, col = 'red', pch = 16, type = 'l', lwd = 2) # plot path of means
points(o.vec, q50.vec, col = 'blue', pch = 16, type = 'l', lwd = 2)
points(o.vec, q25.vec, col = 'blue', pch = 16, type = 'l', lwd = 2, lty = 2)
points(o.vec, q75.vec, col = 'blue', pch = 16, type = 'l', lwd = 2, lty = 2)
legend('topleft', c('mean', 'median', '1st quartile', '3rd quartile'),
col = c('red', rep('blue', 3)), lwd = 2, lty = c(1, 1, 2, 2), inset = .05)

# conditional distribution
ggplot(dat, aes(x=oexp, y=income)) + geom_boxplot(aes(group=oexp)) + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title='Conditional Distribution') + theme_bw()

# conditional expectation
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title='Conditional mean') + theme_bw() +
stat_summary(fun = mean, geom = 'point', col = "red", size=2.5) +
stat_summary(fun = mean, geom = 'line', col = "red", size=1)

Binning
# create equi-distant bins using cut() (results in a "factor")
dat$oexpbins <- cut(dat$oexp, breaks = seq(0, 50, by = 5), right = F)
m.vec <- with(dat, tapply(income, oexpbins, mean)) # vector of income means for each bin
o.vec <- with(dat, tapply(oexp, oexpbins, mean)) # vector of oexp means for each bin
# plot scatterplot and path of bin-means
plot(income ~ oexp, data = dat, cex = .4, xlab = 'Occupational Experience',
ylab = 'Monthly Net Income', main = 'Path of Group Means')
points(o.vec, m.vec, type = 'o', col = 'red', lwd = 2) # plot path of means
abline(v = seq(4.5, 50, by = 5), lty = 2) # add bin borders

# plot scatterplot and stepfunction of bin-means
plot(income ~ oexp, data = dat, cex = .4, xlab = 'Occupational Experience',
ylab = 'Monthly Net Income', main = 'Step Function of Group Means')
points(seq(-.5, 50, by = 5), c(m.vec, m.vec[length(m.vec)]), type = 's',
col = 'red', lwd = 3) # type = 's' plots a stepfunction

# create equi-distant bins using cut() (results in a "factor")
dat$oexpbins <- cut(dat$oexp, breaks = seq(0, 50, by = 5), right = F)
bin.stat <- dat %>% group_by(oexpbins) %>%
summarize (m.income = mean(income), m.oexp = mean(oexp), .groups = "drop") # income means and oexp means for each bin
# plot scatterplot and path of bin-means
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title="Path of Group Means") + theme_bw() +
geom_point(data=bin.stat, aes(x=m.oexp, y=m.income), col = "red", size=2.5) +
geom_line(data=bin.stat, aes(x=m.oexp, y=m.income), col = "red", size=1) +
geom_vline(xintercept = seq(0, 50, by = 5), linetype="dotted", size=0.5)

# plot scatterplot and stepfunction of bin-means
bin.stat2 <- cbind(rbind(bin.stat, bin.stat[nrow(bin.stat),]), breaks=seq(0, 50, by = 5))
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title="Step Function of Group Means") + theme_bw() +
geom_step(data=bin.stat2, aes(x=breaks, y=m.income), col = "red", size=1)

Local averaging
- write our own function for local averaging: loc.av()
loc.av <- function(y, x, w)
{
# computes local means of y for each unique value of x
# with a centered/symmetric window of width w
# y ... vector of dependent variable
# x ... vector of independent variable
# w ... width of the window for averaging
x.val <- sort(unique(x)) # determines the unique values of x and sorts them
avrg <- function(y, x, x.loc, w) {
# determines the local mean value at a specific value x.loc
mean(y[x >= x.loc-w/2 & x <= x.loc+w/2])
}
m.vec <- sapply(x.val, avrg, y = y, x = x, w = w) # apply function to each x.val
data.frame(x = x.val, y = m.vec) # return data frame of x values and means
}
out.av <- with(dat, loc.av(income, oexp, w = 5)) # matrix of x and mean values
plot(income ~ oexp, data = dat, cex = .4, xlab = 'Occupational Experience',
ylab = 'Monthly Net Income', main = 'Local Averaging') # plot scatterplot
lines(out.av, col = 'red', lwd = 3) # add path of means obtained from loc.av()
# add path of means resulting from different window choices (10y and 3y)
out.av2 <- with(dat, loc.av(income, oexp, w = 10))
lines(out.av2, col = 'blue', lwd = 2)
out.av3 <- with(dat, loc.av(income, oexp, w = 3))
lines(out.av3, col = 'green', lwd = 2)
legend('topleft', c('window = 3', 'window = 5', 'window = 10'),
col = c('green', 'red', 'blue'), lwd=c(2, 3, 2), inset = .05)

out.av <- with(dat, loc.av(income, oexp, w = 5)) # matrix of x and mean values
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title='Local Averaging') + geom_line(data=out.av, aes(x=x, y=y), col = "red", size=1) + theme_bw()

Kernel estimation
- write our own function for regression smoothing: k.smth()
k.smth <- function(y, x, h)
{
# computes a weighted mean of y for each unique value of x
# with a centered normal kernel
# y ... vector of dependent variable
# x ... vector of independent variable
# h ... bandwidth (standard deviation of the normal kernel)
x.val <- sort(unique(x)) # sorted unique values of x
avrg <- function(y, x, x.loc, h) {
# computes local mean value at x.loc using normal kernel weights
wt <- dnorm((x - x.loc) / h) # weigths
(y %*% wt) / sum(wt) # locally weighted mean; or: weighted.mean(y, wt)
}
m.vec <- sapply(x.val, avrg, y = y, x = x, h = h) # applies avrg() to all x.vals
data.frame(x = x.val, y = m.vec) # returns matrix of x and mean values
}
# bandwidth (standard dev. = 1)
out.smth <- with(dat, k.smth(income, oexp, 1)) # get x and mean values
plot(income ~ oexp, data = dat, cex = .4, xlab = 'Occup. Experience',
ylab = 'Monthly Net Income', main = 'Kernel Estimation') # scatterplot
lines(out.smth, col = 'red', lwd = 3) # add path of means
# add path of means with a larger bandwidth (standard dev. = 3)
out.smth2 <- with(dat, k.smth(income, oexp, 3))
lines(out.smth2, col = 'blue', lwd = 2)
legend('topleft', c('SD = 1', 'SD = 3'),
col = c('red', 'blue'), lwd=c(3, 2), inset = .05)

out.smth <- with(dat, k.smth(income, oexp, 1)) # get x and mean values
ggplot(dat, aes(x=oexp, y=income)) + geom_point() + labs(x = 'Occupational Experience', y = 'Monthly Net Income', title='Kernel Estimation') + geom_line(data=out.smth, aes(x=x, y=y), col = "red", size=1) + theme_bw()
